**Create sample (Sample_Unemp_NILF_Selfemp.dta) in LIS*
**Meiying Li 5 Aug 2024*

* Country-year dataset codes that make_data loops over. Each code is used to
* build the names of the person file (code + p) and household file (code + h).
global datasets "au04 au08 au10 au14 au16 au18 at00 at03 at04 at05 at06 at07 at08 at09 at10 at11 at12 at13 at14 at15 at16 at17 at18 at19 be00 be03 be04 be05 be06 be07 be08 be09 be10 be11 be12 be13 be14 be15 be16 be17 cz02 cz04 cz07 cz10 cz13 cz16 dk00 dk04 dk07 dk10 dk13 dk16 ee00 ee04 ee07 ee10 ee13 ee16 fi00 fi04 fi07 fi10 fi13 fi16  fr99 fr00 fr01 fr02 fr03 fr04 fr05 fr06 fr07 fr08 fr09 fr10 fr11 fr12 fr13 fr14 fr15 fr16 fr17 fr18  de99 de00 de01 de02 de03 de04 de05 de06 de07 de08 de09 de10 de11 de12 de13 de14 de15 de16 de17 de18 gr00 gr04 gr07 gr10 gr13 gr16 hu99 hu05 hu07 hu09 hu12 hu15 is04 is07 is10 ie00 ie02 ie03 ie04 ie05 ie06 ie07 ie08 ie09 ie10 ie11 ie12 ie13 ie14 ie15 ie16 ie17 ie18  it00 it04 it08 it10 it14 it16 jp08 jp10 jp13 lt09 lt10 lt11 lt12 lt13 lt14 lt15 lt16 lt17 lt18 lu00 lu04 lu07 lu10 lu13 nl99 nl04 nl07 nl10 nl13 nl15 nl16 nl17 nl18 no00 no04 no07 no10 no13 no16  pl04 pl05 pl06 pl07 pl08 pl09 pl10 pl11 pl12 pl13 pl14 pl15 pl16 pl17 pl18 pl19 sk04 sk07 sk10 sk13 sk14 sk15 sk16 sk17 sk18 si99 si04 si07 si10 si12 si15 kr06 kr08 kr10 kr12 kr14 kr16 es00 es04 es07 es10 es13 es16 se00 se02 se03 se04 se05 se06 se07 se08 se09 se10 se11  se13 se14 se15 se16 se17 se18 se19 ch00 ch02 ch04 ch06 ch07 ch08 ch09 ch10 ch11 ch12 ch13 ch14 ch15 ch16 ch17 ch18 uk99 uk00 uk01 uk02 uk03 uk04 uk05 uk06 uk07 uk08 uk09 uk10 uk11 uk12 uk13 uk14 uk15 uk16 uk17 uk18  us99 us00 us01 us02 us03 us04 us05 us06 us07 us08 us09 us10 us11 us12 us13 us14 us15 us16 us17 us18 us19"  

// pl99 is excluded because pilabour equals 0 for every observation
// au01, au03 and se12 are excluded because educ has no non-missing values


*********************** Get the main variables 

* varshh: variables taken from the household file (passed to merge, keepusing())
global varshh "hid own nearn nhhmem17 hitransfer hilabour "
* varspp: variables loaded from the person file (passed to use ... using)
global varspp "hid pid  did dname cname  year pilabour ppopwgt  age sex marital educlev educ  emp  lfs fyft hourstot weeks public1    status1 inda1 indb1 indc1 indd1 ind1_c occa1 occb1 occ1_c  ptime1 hours1 net1    gross1    pwgt iso2 relation partner  nchildren ageyoch wexptl  pi11 immigr  temp1 pi411 "       


      
      
* make_data: build the stacked person-level file used for sample selection.
* For each country-year code in the datasets global, the program
*   1. loads the person-file variables (varspp) and merges the household-file
*      variables (varshh) on hid;
*   2. derives household-composition, demographic, work and income variables;
*   3. keeps respondents aged 25-44;
*   4. appends the result to the running file try_meiyli2 in the mydata folder.
* Takes no arguments. On exit the stacked data are in memory and saved on disk.
* Dropping any earlier definition lets the do-file be re-run in the same session.
capture program drop make_data
program define make_data
    * first = 1 only during the first pass of the loop. The first country-year
    * starts the stacked file; every later one is appended to it. A flag is used
    * instead of comparing against a fixed dataset code so the loop stays correct
    * when the dataset list is reordered or shortened.
    local first = 1
    foreach ccyy in $datasets {
        use $varspp using $`ccyy'p,clear
        merge m:1 hid using $`ccyy'h, keepusing($varshh)

        * number of household members aged 0-5 (computed before the age restriction)
        gen hhmem5 = inrange(age,0,5) if !missing(age)
        by hid, sort: egen nhhmem5=total(hhmem5) if !missing(hhmem5)
        * age of the youngest person in the household, used to fill missing ageyoch
        by hid, sort: egen ageyoHH = min(age)

        * analysis population: respondents aged 25-44
        keep if inrange(age,25,44)

        * Female: 1 if sex == 2, 0 for any other non-missing sex
        gen Female = 0 if sex != .
        replace Female = 1 if sex == 2
        * partnered: 1 if the marital code is below 200
        gen partnered = 0 if marital != .
        replace partnered=1 if marital <200 & marital != .
        gen age2 = age^2
        * othemp: 1 if nearn exceeds the respondent's own emp indicator
        * (nearn is presumably the household's number of earners - confirm against LIS docs);
        * missing when either input is missing
        gen othemp = 0
        replace othemp = 1 if nearn-emp > 0
        replace othemp =. if nearn==. | emp ==.
        * tertiary: 1 if educ == 3
        gen tertiary = 0 if educ !=.
        replace tertiary = 1 if educ==3  & educ !=.
        * working hours: hourstot, falling back to hours1 when hourstot is missing
        gen workhours = hourstot
        replace workhours = hours1 if hourstot==.
        * full-time: fyft, falling back to a 30-hour cut-off on workhours when fyft is missing
        gen fulltime = fyft
        replace fulltime = 1 if fyft ==. & workhours!=. & workhours >=30
        replace fulltime = 0 if  fyft ==. & workhours!=. & workhours < 30

        * recode missing data for nchildren; most missing cases for nchildren are nhhmem17==0
        replace nchildren = nhhmem17 if nchildren==.
        * ageyoch 0 is a child under 12 months; respondents without children are missing (.)
        * as are truly missing cases. Separate the two using nchildren and give parents with
        * a missing ageyoch the age of the youngest household member.
        recode ageyoch (.= -99)
        recode ageyoch (-99 = -88) if nchildren>=1
        replace ageyoch = ageyoHH if ageyoch== -88
        * non-parents are now coded -99 on ageyoch; parents with a missing ageyoch now carry ageyoHH
        * recode respondents whose youngest child is aged 21 or over to no children
        replace nchildren=0 if ageyoch >=21
        recode nchildren (0=0)(1/18=1), gen(parent)
        * ykid: 1 if the youngest child is aged 0-5, 0 otherwise (including non-parents)
        recode ageyoch (0/5=1)(6/90=0)(-99=0)(-88=.), gen(ykid)

        **** Heckman selection, create inverse mills-ratio for each country-year.
        *** Budig, M. J., Misra, J., & Boeckmann, I. (2012). The motherhood penalty in cross-national perspective: The importance of work–family policies and cultural attitudes. Social Politics, 19(2), 163-193.
        *** Use 1) logged transfer income, 2) logged other household income (total household wage - personal wage), 3) presence of a preschooler to predict the probability of employment
        *** Notes: transfer income and other household income are not adjusted using ppp, which is acceptable because the mills-ratio is calculated for each country-year separately;
        *** Handling of missing values: missing transfer and household income are treated as 0,
        *** but personal income stays missing if pilabour ==. or emp ==.
        *** this makes sure the sample does not change when Heckman selection is added to the model
        gen transfer = log(hitransfer)
        replace transfer = 0 if transfer==.
        gen HHincome = log(hilabour)
        replace HHincome = 0 if HHincome ==.
        gen Income = log(pilabour) if pilabour!=.  & emp!=.
        replace Income = 0 if Income ==. & pilabour!=.  & emp!=.
        * Other household income is the log of (household labour income - own labour income).
        * Note that log(hilabour) - log(pilabour) would be the log of the ratio of the two,
        * not the log of their difference, so the difference is taken before logging.
        * An undefined log (difference missing, zero or negative) is set to 0, as for the
        * other income terms. otherincome is missing exactly when Income is missing.
        gen otherincome = log(hilabour - pilabour) if Income !=.
        replace otherincome = 0 if otherincome ==. & Income !=.
        gen missing_heckman = 0
        replace missing_heckman = 1 if emp ==. |Female ==. | ykid ==. |  transfer ==. | otherincome ==.| educ==.

        * stack this country-year under the ones processed so far
        if !`first' {
            append using $mydata/try_meiyli2
        }
        save $mydata/try_meiyli2, replace
        local first = 0
    }
end
       
* Build the stacked person-level file (it is left in memory) and store it again
quietly make_data

save $mydata/try_meiyli2, replace


*********************** Get the samples
***** missingness
* missing: 1 when any of the core variables is missing (.), otherwise 0
gen missing = (emp ==. | pilabour ==. | Female ==. | partnered ==. | tertiary ==. | nhhmem17 ==. | othemp ==. | ykid ==.)

* Each flag below is 1 when the core flag is 1 or the listed extra variable(s) are missing
gen missing_fulltime = (missing == 1 | fulltime ==.)

gen missing_workhour = (missing == 1 | workhours ==.)

gen missing_workhour_weeks = (missing == 1 | workhours ==. | weeks ==.)

gen missing_occ = (missing == 1 | occb1 ==.)

gen missing_public = (missing == 1 | public1 ==.)

gen missing_exp = (missing == 1 | wexptl ==.)


******** restrict the sample to employees
* Sample_emp_CY: 1 for employed respondents (emp == 1) with a status1 code below 200
gen Sample_emp_CY = (emp == 1 & status1 < 200)

**** drop country-years in which more than 7.1% of the employees have missing core variables
* mean of the missing flag over employees, within each country-year (dname)
egen Mean_missing = mean(missing) if Sample_emp_CY == 1, by(dname)
** copy the employee mean onto the non-employee rows of the same country-year
by dname (Mean_missing), sort: replace Mean_missing = Mean_missing[_n-1] if Mean_missing >= .
by dname (Mean_missing), sort: replace Mean_missing = Mean_missing[_n+1] if Mean_missing >= .

* rows whose country-year mean is above the threshold (or undefined) are removed
drop if Mean_missing > 0.071

*** drop countries that are left with a single country-year
egen cy_first = tag(dname cname)
egen n_cy = total(cy_first), by(cname)
drop if n_cy <= 1
drop n_cy cy_first
// no country is removed

**** drop the rows that cannot enter the Heckman selection equation (mills ratio)
drop if missing_heckman == 1
* everything that is left belongs to the selection-equation sample
gen Sample_all_heck = 1
* employees with complete core variables form the outcome-equation sample
gen Sample_all_emp = (emp == 1 & status1 < 200 & missing == 0)


*Sample: add the full-time variable
***** exclude country-years in which the full-time variable adds more than 6% missingness to the all-employees sample.
* share of complete-case employees with a missing full-time flag, per country-year (dname)
bysort dname: egen Mean_missingfulltime =mean(missing_fulltime)  if Sample_emp_CY ==1 & missing ==0
* copy the share onto every row of the country-year; missing values sort last within dname,
* so the forward pass fills them and the backward pass is kept only as a safeguard
bysort dname (Mean_missingfulltime): replace Mean_missingfulltime = Mean_missingfulltime[_n-1] if missing(Mean_missingfulltime)
bysort dname (Mean_missingfulltime): replace Mean_missingfulltime = Mean_missingfulltime[_n+1] if missing(Mean_missingfulltime)
* _heck: all rows of a retained country-year; _emp: its employees with a non-missing full-time flag
gen Sample_fulltime_heck = 0
replace Sample_fulltime_heck = 1 if Mean_missingfulltime <= 0.06
gen Sample_fulltime_emp = 0
replace Sample_fulltime_emp = 1 if Mean_missingfulltime <= 0.06 &  missing_fulltime ==0 & Sample_emp_CY ==1
*** exclude countries that only appear once
egen tag = tag(dname cname) if Sample_fulltime_emp == 1
* the if qualifier has to come before the comma that starts the options
egen N_CY = total(tag) if Sample_fulltime_emp == 1, by(cname)
* copy the country-year count onto the rows outside the employee sample
bysort dname (N_CY): replace N_CY = N_CY[_n-1] if missing(N_CY)
bysort dname (N_CY): replace N_CY = N_CY[_n+1] if missing(N_CY)
replace Sample_fulltime_heck = 0 if N_CY <= 1
replace Sample_fulltime_emp = 0 if N_CY <= 1
drop N_CY  tag

*Sample: add the work hour variable
*** exclude country-years in which work hours add more than 6% missingness to the all-employees sample
* share of complete-case employees with missing work hours, per country-year (dname)
bysort dname: egen Mean_missingWH =mean(missing_workhour) if Sample_emp_CY ==1 & missing ==0
* copy the share onto every row of the country-year; missing values sort last within dname,
* so the forward pass fills them and the backward pass is kept only as a safeguard
bysort dname (Mean_missingWH): replace Mean_missingWH = Mean_missingWH[_n-1] if missing(Mean_missingWH)
bysort dname (Mean_missingWH): replace Mean_missingWH = Mean_missingWH[_n+1] if missing(Mean_missingWH)
* _heck: all rows of a retained country-year; _emp: its employees with non-missing work hours
gen Sample_WH_heck = 0
replace Sample_WH_heck = 1 if Mean_missingWH <=0.06
gen Sample_WH_emp = 0
replace Sample_WH_emp = 1 if Mean_missingWH <=0.06 &  missing_workhour ==0 & Sample_emp_CY ==1
*** exclude countries that only appear once
egen tag = tag(dname cname) if Sample_WH_emp == 1
* the if qualifier has to come before the comma that starts the options
egen N_CY = total(tag) if Sample_WH_emp == 1, by(cname)
* copy the country-year count onto the rows outside the employee sample
bysort dname (N_CY): replace N_CY = N_CY[_n-1] if missing(N_CY)
bysort dname (N_CY): replace N_CY = N_CY[_n+1] if missing(N_CY)
replace Sample_WH_emp = 0 if N_CY <= 1
replace Sample_WH_heck = 0 if N_CY <= 1
drop N_CY  tag

*Sample: add the weeks variable to the work hour sample
*** exclude country-years in which work hours and weeks add more than 6% missingness to the all-employees sample
* share of complete-case employees with missing work hours or weeks, per country-year (dname)
bysort dname: egen Mean_missingWH_weeks =mean(missing_workhour_weeks) if Sample_emp_CY ==1 & missing ==0
* copy the share onto every row of the country-year; missing values sort last within dname,
* so the forward pass fills them and the backward pass is kept only as a safeguard
bysort dname (Mean_missingWH_weeks): replace Mean_missingWH_weeks = Mean_missingWH_weeks[_n-1] if missing(Mean_missingWH_weeks)
bysort dname (Mean_missingWH_weeks): replace Mean_missingWH_weeks = Mean_missingWH_weeks[_n+1] if missing(Mean_missingWH_weeks)
* _heck: all rows of a retained country-year; _emp: its employees with non-missing work hours and weeks
gen Sample_WH_weeks_heck = 0
replace Sample_WH_weeks_heck = 1 if Mean_missingWH_weeks <=0.06
gen Sample_WH_weeks_emp = 0
replace Sample_WH_weeks_emp = 1 if Mean_missingWH_weeks <=0.06 &  missing_workhour_weeks ==0 & Sample_emp_CY ==1
*** exclude countries that only appear once
egen tag = tag(dname cname) if Sample_WH_weeks_emp == 1
* the if qualifier has to come before the comma that starts the options
egen N_CY = total(tag) if Sample_WH_weeks_emp == 1, by(cname)
* copy the country-year count onto the rows outside the employee sample
bysort dname (N_CY): replace N_CY = N_CY[_n-1] if missing(N_CY)
bysort dname (N_CY): replace N_CY = N_CY[_n+1] if missing(N_CY)
replace Sample_WH_weeks_emp = 0 if N_CY <= 1
replace Sample_WH_weeks_heck = 0 if N_CY <= 1
drop N_CY  tag


*Sample: add the occupation variable
*** exclude country-years in which occupation (occb1) adds more than 6% missingness to the all-employees sample
* share of complete-case employees with missing occupation, per country-year (dname)
bysort dname: egen Mean_missingOCC =mean(missing_occ) if Sample_emp_CY ==1 & missing ==0
* copy the share onto every row of the country-year; missing values sort last within dname,
* so the forward pass fills them and the backward pass is kept only as a safeguard
bysort dname (Mean_missingOCC): replace Mean_missingOCC = Mean_missingOCC[_n-1] if missing(Mean_missingOCC)
bysort dname (Mean_missingOCC): replace Mean_missingOCC = Mean_missingOCC[_n+1] if missing(Mean_missingOCC)
* _heck: all rows of a retained country-year; _emp: its employees with non-missing occupation
gen Sample_OCC_heck = 0
replace Sample_OCC_heck = 1 if Mean_missingOCC <=0.06
gen Sample_OCC_emp = 0
replace Sample_OCC_emp = 1 if Mean_missingOCC <=0.06 &  missing_occ ==0 & Sample_emp_CY ==1
*** exclude countries that only appear once
egen tag = tag(dname cname) if Sample_OCC_emp == 1
* the if qualifier has to come before the comma that starts the options
egen N_CY = total(tag) if Sample_OCC_emp == 1, by(cname)
* copy the country-year count onto the rows outside the employee sample
bysort dname (N_CY): replace N_CY = N_CY[_n-1] if missing(N_CY)
bysort dname (N_CY): replace N_CY = N_CY[_n+1] if missing(N_CY)
replace Sample_OCC_heck = 0 if N_CY <= 1
replace Sample_OCC_emp = 0 if N_CY <= 1
drop N_CY tag


*Sample: add the public sector variable
*** exclude country-years in which the public sector variable (public1) adds more than 6% missingness to the all-employees sample
* share of complete-case employees with missing public1, per country-year (dname)
bysort dname: egen Mean_missingPub =mean(missing_public) if Sample_emp_CY ==1 & missing ==0
* copy the share onto every row of the country-year; missing values sort last within dname,
* so the forward pass fills them and the backward pass is kept only as a safeguard
bysort dname (Mean_missingPub): replace Mean_missingPub = Mean_missingPub[_n-1] if missing(Mean_missingPub)
bysort dname (Mean_missingPub): replace Mean_missingPub = Mean_missingPub[_n+1] if missing(Mean_missingPub)
* _heck: all rows of a retained country-year; _emp: its employees with non-missing public1
gen Sample_Public_heck = 0
replace Sample_Public_heck = 1 if Mean_missingPub <=0.06
gen Sample_Public_emp = 0
replace Sample_Public_emp = 1 if Mean_missingPub <=0.06 &  missing_public ==0 & Sample_emp_CY ==1
*** exclude countries that only appear once
egen tag = tag(dname cname) if Sample_Public_emp == 1
* the if qualifier has to come before the comma that starts the options
egen N_CY = total(tag) if Sample_Public_emp == 1, by(cname)
* copy the country-year count onto the rows outside the employee sample
bysort dname (N_CY): replace N_CY = N_CY[_n-1] if missing(N_CY)
bysort dname (N_CY): replace N_CY = N_CY[_n+1] if missing(N_CY)
replace Sample_Public_heck = 0 if N_CY <= 1
replace Sample_Public_emp = 0 if N_CY <= 1
drop N_CY  tag


*Sample: add the work experience variable
*** exclude country-years in which work experience (wexptl) adds more than 6% missingness to the all-employees sample
* share of complete-case employees with missing wexptl, per country-year (dname)
bysort dname: egen Mean_missingEXP =mean(missing_exp) if Sample_emp_CY ==1 & missing ==0
* copy the share onto every row of the country-year; missing values sort last within dname,
* so the forward pass fills them and the backward pass is kept only as a safeguard
bysort dname (Mean_missingEXP): replace Mean_missingEXP = Mean_missingEXP[_n-1] if missing(Mean_missingEXP)
bysort dname (Mean_missingEXP): replace Mean_missingEXP = Mean_missingEXP[_n+1] if missing(Mean_missingEXP)
* _heck: all rows of a retained country-year; _emp: its employees with non-missing wexptl
gen Sample_EXP_heck = 0
replace Sample_EXP_heck = 1 if Mean_missingEXP <=0.06
gen Sample_EXP_emp = 0
replace Sample_EXP_emp = 1 if Mean_missingEXP <=0.06 &  missing_exp ==0 & Sample_emp_CY ==1
*** exclude countries that only appear once
egen tag = tag(dname cname) if Sample_EXP_emp == 1
* the if qualifier has to come before the comma that starts the options
egen N_CY = total(tag) if Sample_EXP_emp == 1, by(cname)
* copy the country-year count onto the rows outside the employee sample
bysort dname (N_CY): replace N_CY = N_CY[_n-1] if missing(N_CY)
bysort dname (N_CY): replace N_CY = N_CY[_n+1] if missing(N_CY)
replace Sample_EXP_heck = 0 if N_CY <= 1
replace Sample_EXP_emp = 0 if N_CY <= 1
drop N_CY  tag

* write the final file with all sample flags
save $mydata/meiyli/Sample_Unemp_NILF_Selfemp.dta, replace 

